In [61]:
# PACKAGES REQUIRED
import pandas as pd
import numpy as np
from tableauscraper import TableauScraper as TS
from dash import Dash, html, Input, Output, dash_table
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
import sklearn as sk
import sklearn.linear_model
import random
import seaborn as sns
from scipy import stats
In [ ]:
# The repository link is below. SeolHaeJuan = Jared, for clarification.
# https://github.com/SeolHaeJuan/STA-141B-Project.git
In [475]:
# Only 2021 for data
url = "https://visualizedata.ucop.edu/t/Public/views/AdmissionsDataTable/TREthbyYr?:embed_code_version=3&:embed=y&:loadOrderID=0&:display_spinner=no&:showAppBanner=false&:display_count=n&:showVizHome=n&:origin=viz_share_link"
ts = TS()
ts.loads(url)
workbook = ts.getWorkbook()

for t in workbook.worksheets:
    print(f"worksheet name : {t.name}") #show worksheet name
    print(t.data) #show dataframe for this worksheet
worksheet name : TR by Year
               School-value           School-alias         Calculation1-value  \
0     ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE4002   
1     ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE4002   
2     ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE4002   
3     ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE4002   
4     ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE  ALLAN HANCOCK COLLEGE4002   
...                     ...                    ...                        ...   
2767           YUBA COLLEGE           YUBA COLLEGE           YUBA COLLEGE4994   
2768           YUBA COLLEGE           YUBA COLLEGE           YUBA COLLEGE4994   
2769           YUBA COLLEGE           YUBA COLLEGE           YUBA COLLEGE4994   
2770           YUBA COLLEGE           YUBA COLLEGE           YUBA COLLEGE4994   
2771           YUBA COLLEGE           YUBA COLLEGE           YUBA COLLEGE4994   

             Calculation1-alias   County-value   County-alias   City-value  \
0     ALLAN HANCOCK COLLEGE4002  Santa Barbara  Santa Barbara  Santa Maria   
1     ALLAN HANCOCK COLLEGE4002  Santa Barbara  Santa Barbara  Santa Maria   
2     ALLAN HANCOCK COLLEGE4002  Santa Barbara  Santa Barbara  Santa Maria   
3     ALLAN HANCOCK COLLEGE4002  Santa Barbara  Santa Barbara  Santa Maria   
4     ALLAN HANCOCK COLLEGE4002  Santa Barbara  Santa Barbara  Santa Maria   
...                         ...            ...            ...          ...   
2767           YUBA COLLEGE4994           Yuba           Yuba   Marysville   
2768           YUBA COLLEGE4994           Yuba           Yuba   Marysville   
2769           YUBA COLLEGE4994           Yuba           Yuba   Marysville   
2770           YUBA COLLEGE4994           Yuba           Yuba   Marysville   
2771           YUBA COLLEGE4994           Yuba           Yuba   Marysville   

       City-alias Count-value Count-alias Uad Uc Ethn 6 Cat-value  \
0     Santa Maria         Enr         Enr           International   
1     Santa Maria         Adm         Adm           International   
2     Santa Maria         App         App           International   
3     Santa Maria         Enr         Enr                 Unknown   
4     Santa Maria         Enr         Enr                 Unknown   
...           ...         ...         ...                     ...   
2767   Marysville         Adm         Adm        African American   
2768   Marysville         App         App        African American   
2769   Marysville         Enr         Enr                     All   
2770   Marysville         Adm         Adm                     All   
2771   Marysville         App         App                     All   

     Uad Uc Ethn 6 Cat-alias SUM(Pivot Field Values)-alias  
0            Inter- national                        %null%  
1            Inter- national                             3  
2            Inter- national                             5  
3           Domestic unknown                        %null%  
4           Domestic unknown                        %null%  
...                      ...                           ...  
2767        African American                             3  
2768        African American                             4  
2769                     All                            40  
2770                     All                            52  
2771                     All                            64  

[2772 rows x 13 columns]
In [476]:
mainframe = t.data
In [477]:
mainframe.drop(mainframe.columns[[1, 2, 3, 4, 7, 8, 11]], axis=1, inplace=True)
In [478]:
mainframe.rename(columns = {'School-value': 'School', 'County-alias': 'County', 'City-value': 'City', 'Count-alias': 'Type', 'Uad Uc Ethn 6 Cat-value': 'Eth','SUM(Pivot Field Values)-alias':'Value'}, inplace = True)
mainframe
Out[478]:
School County City Type Eth Value
0 ALLAN HANCOCK COLLEGE Santa Barbara Santa Maria Enr International %null%
1 ALLAN HANCOCK COLLEGE Santa Barbara Santa Maria Adm International 3
2 ALLAN HANCOCK COLLEGE Santa Barbara Santa Maria App International 5
3 ALLAN HANCOCK COLLEGE Santa Barbara Santa Maria Enr Unknown %null%
4 ALLAN HANCOCK COLLEGE Santa Barbara Santa Maria Enr Unknown %null%
... ... ... ... ... ... ...
2767 YUBA COLLEGE Yuba Marysville Adm African American 3
2768 YUBA COLLEGE Yuba Marysville App African American 4
2769 YUBA COLLEGE Yuba Marysville Enr All 40
2770 YUBA COLLEGE Yuba Marysville Adm All 52
2771 YUBA COLLEGE Yuba Marysville App All 64

2772 rows × 6 columns

In [479]:
mainframe.loc[mainframe["Value"] == "%null%", "Value"] = 0
In [498]:
# Dataframe subsetting
enr_frame = mainframe.loc[(mainframe['Type'] == 'Enr') & (mainframe['Eth'] != 'All')]
adm_frame = mainframe.loc[(mainframe['Type'] == 'Adm') & (mainframe['Eth'] != 'All')]
app_frame = mainframe.loc[(mainframe['Type'] == 'App') & (mainframe['Eth'] != 'All')]
tot_frame = mainframe[mainframe['Eth'] == 'All']

# Resetting index so we can properly divide the columns later
enr_frame = enr_frame.reset_index()
adm_frame = adm_frame.reset_index()
app_frame = app_frame.reset_index()
In [499]:
# Getting rid of index col
enr_frame.drop(enr_frame.columns[[0]],axis=1,inplace=True) 
adm_frame.drop(adm_frame.columns[[0]], axis = 1, inplace = True)
app_frame.drop(app_frame.columns[[0]],axis=1,inplace=True)

# Switching to float for division later
app_frame['Value'] = pd.to_numeric(app_frame['Value'])
adm_frame['Value'] = pd.to_numeric(adm_frame['Value']) # Focusing on this frame
enr_frame['Value'] = pd.to_numeric(adm_frame['Value'])
In [401]:
tot_frame[tot_frame.Type == 'Adm'].sort_values(by=['Value'], ascending = False)
In [500]:
# Rate column
pd.set_option('mode.use_inf_as_na', True)
app_frame['Value'] = pd.to_numeric(app_frame['Value'])
adm_frame['Value'] = pd.to_numeric(adm_frame['Value'])
adm_frame['Rate'] = adm_frame['Value']/app_frame['Value']
app_frame['Rate'] = adm_frame['Value']/app_frame['Value']
adm_frame = adm_frame.fillna(0)
app_frame = app_frame.fillna(0)
In [501]:
# Getting rid of unknowns. Better to do it after we calculated rate to not mess with positioning
adm_frame = adm_frame[adm_frame['Eth']!= 'Unknown']
app_frame = app_frame[app_frame['Eth']!= 'Unknown']
enr_frame = enr_frame[enr_frame['Eth']!= 'Unknown']
In [685]:
adm_frame.loc[adm_frame['Rate'] == 1.0].head(5)
Out[685]:
School County City Type Eth Value Rate Binary Rate
43 BERKELEY CITY COLLEGE Alameda Berkeley Adm American Indian 3 1.0 1
45 BUTTE COLLEGE Butte Oroville Adm International 6 1.0 1
48 BUTTE COLLEGE Butte Oroville Adm Asian 11 1.0 1
57 CABRILLO COLLEGE Santa Cruz Aptos Adm American Indian 4 1.0 1
129 COLLEGE OF MARIN Marin Kentfield Adm International 4 1.0 1
In [503]:
adm_frame.loc[adm_frame['Rate'] == 1.0]['Eth'].value_counts()
Out[503]:
International       11
American Indian      5
African American     4
Asian                2
Chicano/Latino       2
White                1
Name: Eth, dtype: int64
In [504]:
sorty = adm_frame.sort_values('Rate', ascending=False) # Highest admit rate schools per eth.
sorty[sorty['Value'] > 10].head(10)
Out[504]:
School County City Type Eth Value Rate
48 BUTTE COLLEGE Butte Oroville Adm Asian 11 1.000000
728 SIERRA COLLEGE Placer Rocklin Adm African American 11 1.000000
133 COLLEGE OF MARIN Marin Kentfield Adm Asian 19 0.950000
477 MIRACOSTA COLLEGE San Diego Oceanside Adm International 33 0.942857
120 COASTLINE COMMUNITY COLLEGE Orange Fountain Valley Adm Chicano/Latino 16 0.941176
507 MOORPARK COLLEGE Ventura Moorpark Adm International 15 0.937500
675 SANTA ANA COLLEGE Orange Santa Ana Adm International 13 0.928571
785 WEST LOS ANGELES COLLEGE Los Angeles Culver City Adm International 38 0.926829
672 SAN JOSE CITY COLLEGE Santa Clara San Jose Adm Asian 24 0.923077
128 COLLEGE OF ALAMEDA Alameda Alameda Adm African American 12 0.923077
In [715]:
# GPA
url = "https://visualizedata.ucop.edu/t/Public/views/AdmissionsDataTable/TRGPAbyYr?:embed_code_version=3&:embed=y&:loadOrderID=0&:display_spinner=no&:showAppBanner=false&:display_count=n&:showVizHome=n&:origin=viz_share_link"
ts = TS()
ts.loads(url)
workbook = ts.getWorkbook()

for t in workbook.worksheets:
    print(f"worksheet name : {t.name}") #show worksheet name
    print(t.data) #show dataframe for this worksheet
    
gpa = t.data
gpa.drop(gpa.columns[[1, 2, 4, 5, 6, 8]], axis=1, inplace=True)
gpa.rename(columns = {'School-value': 'School', 'City-alias': 'City', 'County-alias': 'County', 'Measure Names-alias': 'Type', 'Measure Values-alias':'GPA'}, inplace = True)
worksheet name : TR GPA by Year
                   School-value                School-alias   City-value  \
0         ALLAN HANCOCK COLLEGE       ALLAN HANCOCK COLLEGE  Santa Maria   
1         ALLAN HANCOCK COLLEGE       ALLAN HANCOCK COLLEGE  Santa Maria   
2         ALLAN HANCOCK COLLEGE       ALLAN HANCOCK COLLEGE  Santa Maria   
3        AMERICAN RIVER COLLEGE      AMERICAN RIVER COLLEGE   Sacramento   
4        AMERICAN RIVER COLLEGE      AMERICAN RIVER COLLEGE   Sacramento   
..                          ...                         ...          ...   
331  WOODLAND COMMUNITY COLLEGE  WOODLAND COMMUNITY COLLEGE     Woodland   
332  WOODLAND COMMUNITY COLLEGE  WOODLAND COMMUNITY COLLEGE     Woodland   
333                YUBA COLLEGE                YUBA COLLEGE   Marysville   
334                YUBA COLLEGE                YUBA COLLEGE   Marysville   
335                YUBA COLLEGE                YUBA COLLEGE   Marysville   

      City-alias                Calculation1-value  \
0    Santa Maria       ALLAN HANCOCK COLLEGE004002   
1    Santa Maria       ALLAN HANCOCK COLLEGE004002   
2    Santa Maria       ALLAN HANCOCK COLLEGE004002   
3     Sacramento      AMERICAN RIVER COLLEGE004004   
4     Sacramento      AMERICAN RIVER COLLEGE004004   
..           ...                               ...   
331     Woodland  WOODLAND COMMUNITY COLLEGE005762   
332     Woodland  WOODLAND COMMUNITY COLLEGE005762   
333   Marysville                YUBA COLLEGE004994   
334   Marysville                YUBA COLLEGE004994   
335   Marysville                YUBA COLLEGE004994   

                   Calculation1-alias   County-value   County-alias  \
0         ALLAN HANCOCK COLLEGE004002  Santa Barbara  Santa Barbara   
1         ALLAN HANCOCK COLLEGE004002  Santa Barbara  Santa Barbara   
2         ALLAN HANCOCK COLLEGE004002  Santa Barbara  Santa Barbara   
3        AMERICAN RIVER COLLEGE004004     Sacramento     Sacramento   
4        AMERICAN RIVER COLLEGE004004     Sacramento     Sacramento   
..                                ...            ...            ...   
331  WOODLAND COMMUNITY COLLEGE005762           Yolo           Yolo   
332  WOODLAND COMMUNITY COLLEGE005762           Yolo           Yolo   
333                YUBA COLLEGE004994           Yuba           Yuba   
334                YUBA COLLEGE004994           Yuba           Yuba   
335                YUBA COLLEGE004994           Yuba           Yuba   

                                   Measure Names-value Measure Names-alias  \
0    [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...            Enrl GPA   
1    [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...             Adm GPA   
2    [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...             App GPA   
3    [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...            Enrl GPA   
4    [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...             Adm GPA   
..                                                 ...                 ...   
331  [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...             Adm GPA   
332  [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...             App GPA   
333  [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...            Enrl GPA   
334  [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...             Adm GPA   
335  [federated.11exkwi1b9bzff10j00kr0sac218].[sum:...             App GPA   

    Measure Values-alias  
0                   3.45  
1                   3.43  
2                   3.34  
3                   3.56  
4                   3.53  
..                   ...  
331                 3.59  
332                 3.41  
333                 3.58  
334                 3.53  
335                 3.40  

[336 rows x 11 columns]
In [716]:
# Subsetting GPA so we only get the one's we want
enr_gpa = gpa[gpa.Type == 'Enrl GPA']
adm_gpa = gpa[gpa.Type == 'Adm GPA']
app_gpa = gpa[gpa.Type == 'App GPA']
adm_gpa = adm_gpa[['School', 'County', 'City', 'Type', 'GPA']]
adm_gpa['Type'] = adm_gpa['Type'].str.replace('GPA', '')

df_all = adm_gpa.merge(adm_max.drop_duplicates(), on=['School', 'County', 'City'], 
                   how='left', indicator=True)

df_all.drop(df_all.columns[[3, 9]], axis=1, inplace=True)
df_all.rename(columns = {'Type_y': 'Type'}, inplace = True)

final_frame = df_all
In [717]:
final_frame.Eth.value_counts() 
# This is the count of what ethnicity each college's max amount of admits is equal to
Out[717]:
Chicano/Latino    51
White             30
Asian             26
International      4
Name: Eth, dtype: int64
In [718]:
final_frame.loc[final_frame['Rate'] == 1.0]['Eth'].value_counts() # only one school with 100% admit rate
Out[718]:
Chicano/Latino    1
Name: Eth, dtype: int64
In [719]:
final_frame.loc[final_frame['Rate'] == 1.0] # Not many people transferred from here
Out[719]:
School County City GPA Type Eth Value Rate
106 WEST HILLS COLLEGE COALINGA Fresno Coalinga 3.57 Adm Chicano/Latino 4.0 1.0
In [713]:
final_frame = adm_frame.sort_values('Value', ascending=False).drop_duplicates('School').sort_index()
# final_frame is the high for each school, where eth is the max amount of students admitted
In [690]:
# Sorting by descending values. These schools had the most students admitted. 
final_frame.sort_values('Value', ascending = False)
Out[690]:
School County City Type Eth Value Rate Binary Rate
589 PASADENA CITY COLLEGE Los Angeles Pasadena Adm Asian 588 0.790323 1
242 DE ANZA COLLEGE Santa Clara Cupertino Adm Asian 566 0.782849 1
693 SANTA MONICA COLLEGE Los Angeles Santa Monica Adm White 492 0.772370 1
685 SANTA BARBARA CITY COLLEGE Santa Barbara Santa Barbara Adm White 473 0.828371 1
616 RIVERSIDE CITY COLLEGE Riverside Riverside Adm Chicano/Latino 352 0.733333 1
... ... ... ... ... ... ... ... ...
199 COPPER MOUNTAIN COLLEGE San Bernardino Joshua Tree Adm White 7 0.875000 1
278 FEATHER RIVER COLLEGE Plumas Quincy Adm White 4 0.666667 1
779 WEST HILLS COLLEGE COALINGA Fresno Coalinga Adm Chicano/Latino 4 1.000000 1
177 COLLEGE OF THE SISKIYOUS Siskiyou Weed Adm Chicano/Latino 3 0.600000 1
385 LASSEN COLLEGE Lassen Susanville Adm White 0 0.000000 0

113 rows × 8 columns

In [692]:
one_outs = adm_frame.loc[adm_frame['Rate'] == 1.0]
inters = one_outs[one_outs['Eth'] == 'International']['School'] # The 11 schools with 100% international student admittance
result1 = final_frame[final_frame['School'].isin(list(inters))]
result1.head(20)
# From this result, we see that all the schools with a 100% international student admit rate, do not have international students as their ethnicity with max admittance. 
Out[692]:
School County City Type Eth Value Rate Binary Rate
47 BUTTE COLLEGE Butte Oroville Adm White 29 0.763158 1
132 COLLEGE OF MARIN Marin Kentfield Adm White 69 0.683168 1
208 COSUMNES RIVER COLLEGE Sacramento Sacramento Adm Asian 89 0.729508 1
283 FOLSOM LAKE COLLEGE Sacramento Folsom Adm White 84 0.770642 1
407 LOS ANGELES HARBOR COLLEGE Los Angeles Wilmington Adm Chicano/Latino 54 0.701299 1
467 MERCED COLLEGE Merced Merced Adm Chicano/Latino 90 0.671642 1
474 MERRITT COLLEGE Alameda Oakland Adm Chicano/Latino 21 0.777778 1
488 MISSION COLLEGE Santa Clara Santa Clara Adm Asian 62 0.738095 1
551 NORCO COLLEGE Riverside Norco Adm Chicano/Latino 92 0.686567 1
574 OXNARD COLLEGE Ventura Oxnard Adm Chicano/Latino 55 0.833333 1
763 VENTURA COLLEGE Ventura Ventura Adm Chicano/Latino 112 0.722581 1
In [417]:
# Our averages across all for adm_frame
print("Mean Number of Admitted Students:", adm_frame['Value'].mean())
print("Mean Admit Rate Across All CCs", adm_frame['Rate'].mean())
Mean Number of Admitted Students: 45.096930533117934
Mean Admit Rate Across All CCs 0.6021040155998523
In [ ]:
final_frame.mean(numeric_only = True)
In [693]:
e_list = ['African American', 'American Indian', 'Asian', 'Chicano/Latino', 'International', 'White']
def get_val(x):
    for i in e_list:
        print(x[x['Eth'] == i].mean(numeric_only = True))

get_val(adm_frame)
get_val(final_frame) # First two are zeroes, as they were not in the final frame
Value          13.000000
Rate            0.558193
Binary Rate     0.754545
dtype: float64
Value          1.525000
Rate           0.251741
Binary Rate    0.312500
dtype: float64
Value          66.321429
Rate            0.664115
Binary Rate     0.875000
dtype: float64
Value          75.531532
Rate            0.732823
Binary Rate     0.990991
dtype: float64
Value          33.319149
Rate            0.596481
Binary Rate     0.723404
dtype: float64
Value          66.241071
Rate            0.708647
Binary Rate     0.946429
dtype: float64
Value         NaN
Rate          NaN
Binary Rate   NaN
dtype: float64
Value         NaN
Rate          NaN
Binary Rate   NaN
dtype: float64
Value          164.307692
Rate             0.774025
Binary Rate      1.000000
dtype: float64
Value          76.961538
Rate            0.717070
Binary Rate     1.000000
dtype: float64
Value          112.750000
Rate             0.875353
Binary Rate      1.000000
dtype: float64
Value          128.870968
Rate             0.735716
Binary Rate      0.967742
dtype: float64
In [234]:
# Does not appear to be normally distributed
sns.kdeplot(
   data=adm_frame, x="Rate", hue="Eth",
   fill=True, common_norm=False, palette="crest",
   alpha=.5, linewidth=0,
)
Out[234]:
<AxesSubplot:xlabel='Rate', ylabel='Density'>
In [109]:
# Adm Frame in pretty format
fig = go.Figure(data=[go.Table(
    header=dict(values=list(adm_frame.columns),
                fill_color='cornflowerblue',
                align='left'),
    cells=dict(values=[adm_frame.School, adm_frame.County, adm_frame.City, adm_frame.Type, adm_frame.Eth, adm_frame.Value, adm_frame.Rate],
               fill_color='ivory',
               align='left'))
])
fig.update_layout(
    title_text = "Admit Frame",
    title_font_size=30,
    font_family="Times New Roman",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
)
fig.update_traces(cells_font=dict(size = 12))
fig.show()
In [ ]:
 
In [26]:
# Plots
fig = px.box(adm_frame, x="Rate", y="Eth", color="Eth", points = "all",
                 labels={
                     "Rate": "Admittance Rate",
                     "Eth": "Ethnicity",
                 },
                title="Boxplot of Admittance Rate Per Each Ethnicity")
fig.show()
In [231]:
# Changing for outliers, and the fact that some schools have zero from int and native american
nhk = adm_frame[adm_frame['Rate'] > 0.2]
fig = px.box(nhk, x="Rate", y="Eth", color="Eth", points = "all",
                 labels={
                     "Rate": "Admittance Rate",
                     "Eth": "Ethnicity",
                 },
                title="Boxplot of Admittance Rate Per Each Ethnicity")
fig.show()
In [29]:
test_frame = adm_frame.loc[adm_frame['Rate']>0.2]
fig = px.scatter(test_frame, 
                 x = 'Value',
                 y = 'Rate',
                 template = 'plotly_dark',
                 color = 'Eth',
                 trendline = 'ols',
                 title = 'Admit Rate Per Different Ethnicities')
fig.update_layout(showlegend=True)
fig.show()
In [340]:
sns.set(rc={'figure.figsize':(8,10)})
sns.pointplot(data=adm_frame, x="Rate", y="Value", hue="Eth", title = "Point Plot of Admitted Ethnicities")
In [768]:
# Final frame in pretty, interactive format
fig = go.Figure(data=[go.Table(
    header=dict(values=list(final_frame.columns),
                fill_color='lavenderblush',
                align='left'),
    cells=dict(values=[final_frame.School, final_frame.County, final_frame.City, final_frame.GPA, final_frame.Type, final_frame.Eth, final_frame.Value, final_frame.Rate],
               fill_color='lavender',
               align='left'))
])
fig.update_layout(
    title_text = "Final Frame | Each CC's Most Admitted Ethnicity",
    title_font_size=30,
    font_family="Times New Roman",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
)
fig.update_traces(cells_font=dict(size = 10))
fig.show()
In [212]:
# Rate is not normally distributed across entire datatset
from scipy import stats
stats.shapiro(adm_frame['Rate'])
Out[212]:
ShapiroResult(statistic=0.7629678249359131, pvalue=6.721139208871741e-29)
In [262]:
# Test for normality of variables, each rate associated is not normal
from scipy.stats import shapiro
for i in e_list:
    print(i, shapiro(adm_frame[adm_frame.Eth == i].Rate),'\n')
African American ShapiroResult(statistic=0.8359304070472717, pvalue=1.0530741834102741e-09) 

American Indian ShapiroResult(statistic=0.638819694519043, pvalue=9.998421361678833e-13) 

Asian ShapiroResult(statistic=0.6992810368537903, pvalue=7.828422155312098e-14) 

Chicano/Latino ShapiroResult(statistic=0.7622840404510498, pvalue=4.056537224184087e-12) 

International ShapiroResult(statistic=0.7731039524078369, pvalue=9.79381784005362e-11) 

White ShapiroResult(statistic=0.6323310136795044, pvalue=2.4369747756228244e-15) 

In [242]:
# TEST FOR DIFFERENCE OF MEANS
from scipy import stats
stats.kruskal(adm_frame[adm_frame.Eth == 'African American'].Rate,
              adm_frame[adm_frame.Eth == 'American Indian'].Rate,
              adm_frame[adm_frame.Eth == 'Asian'].Rate,
             adm_frame[adm_frame.Eth == 'Chicano/Latino'].Rate,
             adm_frame[adm_frame.Eth == 'International'].Rate,
             adm_frame[adm_frame.Eth == 'White'].Rate)

# DIFFERENCE OF MEANS
Out[242]:
KruskalResult(statistic=84.75056525805913, pvalue=8.491057372967863e-17)
In [ ]:
## FOR NORMALITY TEST OF TRANSFORMED FRAME JUST TO SEE
stats.kruskal(minion_frame[minion_frame.Eth == 'African American'].Rate,
              minion_frame[minion_frame.Eth == 'American Indian'].Rate,
              minion_frame[minion_frame.Eth == 'Asian'].Rate,
             minion_frame[minion_frame.Eth == 'Chicano/Latino'].Rate,
             minion_frame[minion_frame.Eth == 'International'].Rate,
             minion_frame[minion_frame.Eth == 'White'].Rate)
In [704]:
### Function to allow us to get the tukey table for what we want to ask. 
def tukey_table(variable):
    gen_title = "Tukey Table for Comparisons of "
    fig = go.Figure(data=[go.Table(
    header=dict(values=list(tukey_sum.columns),
                fill_color='lavenderblush',
                align='left'),
    cells=dict(values=[tukey_sum.group1, tukey_sum.group2, tukey_sum.meandiff, tukey_sum['p-adj'], tukey_sum.lower, tukey_sum.upper, tukey_sum.reject],
               fill_color='lavender',
               align='left'))
    ])
    fig.update_layout(
    title_text = gen_title + variable,
    title_x=0.5,
    title_font_size=30,
    font_family="Times New Roman",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
    )
    fig.update_traces(cells_font=dict(size = 10))
    fig.show()
In [705]:
def tukey_grab(frame, variable):
    comp = []
    gru = []
    for i in e_list:
        x = list(frame[frame.Eth == i][variable])
        comp = x + comp
        y = list(np.repeat([i], repeats = len(x)))
        gru = y + gru
    # Creating df for our list of rates and eths
    diff_frame = pd.DataFrame({variable: comp, 'Eth': gru}) 

    # actual tukey test
    tukey = pairwise_tukeyhsd(endog=diff_frame[variable].rank(), # USED RANK FOR NONPARAMETRIC
                          groups=diff_frame['Eth'],
                          alpha=0.05)
    global tukey_sum # Need to make it global so it works on the other function
    tukey_sum = pd.DataFrame(data=tukey._results_table.data[1:], columns=tukey._results_table.data[0])
    tukey_table(variable)
    
    print(tukey)
    # Convert to dataframe for figure
In [706]:
tukey_grab(adm_frame, 'Rate')
           Multiple Comparison of Means - Tukey HSD, FWER=0.05            
==========================================================================
     group1           group2     meandiff p-adj    lower    upper   reject
--------------------------------------------------------------------------
African American American Indian -66.5994 0.0715 -136.4256   3.2267  False
African American           Asian 101.6881 0.0001   37.8979 165.4783   True
African American  Chicano/Latino 110.7976    0.0   46.8652   174.73   True
African American   International 112.0691    0.0   45.3212 178.8169   True
African American           White 112.4872    0.0    48.697 176.2774   True
 American Indian           Asian 168.2875    0.0   98.7243 237.8507   True
 American Indian  Chicano/Latino  177.397    0.0  107.7034 247.0906   True
 American Indian   International 178.6685    0.0  106.3834 250.9535   True
 American Indian           White 179.0866    0.0  109.5234 248.6498   True
           Asian  Chicano/Latino   9.1095 0.9985  -54.5355  72.7546  False
           Asian   International   10.381 0.9978  -56.0917  76.8537  False
           Asian           White  10.7991 0.9966  -52.7031  74.3013  False
  Chicano/Latino   International   1.2715    1.0  -65.3377  67.8807  False
  Chicano/Latino           White   1.6896    1.0  -61.9555  65.3346  False
   International           White   0.4181    1.0  -66.0546  66.8908  False
--------------------------------------------------------------------------
In [707]:
## DIFF IN VALUE
tukey_grab(adm_frame, 'Value')
            Multiple Comparison of Means - Tukey HSD, FWER=0.05             
============================================================================
     group1           group2      meandiff p-adj    lower     upper   reject
----------------------------------------------------------------------------
African American American Indian -146.6199    0.0   -204.47  -88.7698   True
African American           Asian  133.3953    0.0   80.5459  186.2447   True
African American  Chicano/Latino  209.5305    0.0  156.5633  262.4977   True
African American   International   -0.6296    1.0  -55.9294   54.6702  False
African American           White  144.9221    0.0   92.0727  197.7715   True
 American Indian           Asian  280.0152    0.0   222.383  337.6474   True
 American Indian  Chicano/Latino  356.1504    0.0  298.4101  413.8907   True
 American Indian   International  145.9903    0.0    86.103  205.8776   True
 American Indian           White   291.542    0.0  233.9097  349.1742   True
           Asian  Chicano/Latino   76.1352 0.0006   23.4061  128.8644   True
           Asian   International -134.0249    0.0 -189.0967  -78.9531   True
           Asian           White   11.5268 0.9891   -41.084   64.1376  False
  Chicano/Latino   International -210.1601    0.0  -265.345 -154.9752   True
  Chicano/Latino           White  -64.6084 0.0065 -117.3376  -11.8793   True
   International           White  145.5517    0.0   90.4798  200.6235   True
----------------------------------------------------------------------------
In [720]:
## DIFFERENCE IN GPA
e_list = ['Asian', 'Chicano/Latino', 'International', 'White']
tukey_grab(final_frame, 'GPA')
         Multiple Comparison of Means - Tukey HSD, FWER=0.05          
======================================================================
    group1         group2     meandiff p-adj   lower    upper   reject
----------------------------------------------------------------------
         Asian Chicano/Latino -34.5754    0.0 -52.1088 -17.0421   True
         Asian  International   2.7212 0.9979 -36.3571  41.7994  False
         Asian          White  -3.2038 0.9734 -22.6995  16.2918  False
Chicano/Latino  International  37.2966 0.0544  -0.4831  75.0762  False
Chicano/Latino          White  31.3716    0.0  14.6303  48.1128   True
 International          White   -5.925 0.9783 -44.6544  32.8044  False
----------------------------------------------------------------------
In [736]:
## FOR NORMALITY TEST OF TRANSFORMED FRAME
# If we were to lose a portion of the data, we would get normality in all except for African American
minion_frame = adm_frame[adm_frame.Rate > 0.20]
e_list = ['African American', 'American Indian', 'Asian', 'Chicano/Latino', 'International', 'White']
for i in e_list:
    print(i, shapiro(minion_frame[minion_frame.Eth == i].Rate),'\n')
African American ShapiroResult(statistic=0.9744502305984497, pvalue=0.06744244694709778) 

American Indian ShapiroResult(statistic=0.9111583232879639, pvalue=0.03235141932964325) 

Asian ShapiroResult(statistic=0.943628191947937, pvalue=0.0003231678856536746) 

Chicano/Latino ShapiroResult(statistic=0.9594532251358032, pvalue=0.0020372187718749046) 

International ShapiroResult(statistic=0.9388893246650696, pvalue=0.0021637685131281614) 

White ShapiroResult(statistic=0.9669060707092285, pvalue=0.009036525152623653) 

In [737]:
stats.kruskal(minion_frame[minion_frame.Eth == 'African American'].Rate,
              minion_frame[minion_frame.Eth == 'American Indian'].Rate,
              minion_frame[minion_frame.Eth == 'Asian'].Rate,
             minion_frame[minion_frame.Eth == 'Chicano/Latino'].Rate,
             minion_frame[minion_frame.Eth == 'International'].Rate,
             minion_frame[minion_frame.Eth == 'White'].Rate)
Out[737]:
KruskalResult(statistic=70.32613987310312, pvalue=8.765120982874429e-14)
In [738]:
get_val(minion_frame) # Means look much more similar
Value          15.543478
Rate            0.667405
Binary Rate     0.902174
dtype: float64
Value          4.880000
Rate           0.805571
Binary Rate    1.000000
dtype: float64
Value          74.280000
Rate            0.743809
Binary Rate     0.980000
dtype: float64
Value          76.218182
Rate            0.739485
Binary Rate     1.000000
dtype: float64
Value          45.391304
Rate            0.812597
Binary Rate     0.985507
dtype: float64
Value          69.336449
Rate            0.741762
Binary Rate     0.990654
dtype: float64
In [367]:
# Logistic Regression
from sklearn.model_selection import train_test_split

train, test = train_test_split(adm_frame, stratify = adm_frame['Eth'], train_size = .70)
train.head()
Out[367]:
School County City Type Eth Value Rate
505 MONTEREY PENINSULA COLLEGE Monterey Monterey Adm American Indian 5 0.833333
181 COLUMBIA COLLEGE Tuolumne Sonora Adm White 17 0.708333
406 LOS ANGELES HARBOR COLLEGE Los Angeles Wilmington Adm Asian 4 0.333333
771 VICTOR VALLEY COLLEGE San Bernardino Victorville Adm Chicano/Latino 61 0.717647
513 MOORPARK COLLEGE Ventura Moorpark Adm American Indian 4 0.666667
In [368]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(train[['Rate']], train['Eth'])
Out[368]:
LogisticRegression()
In [369]:
# Our coefficients
test.copy()
test['Predicted'] = lr.predict(test[['Rate']])
lr.coef_, lr.intercept_
Out[369]:
(array([[-0.33648891],
        [-2.28982245],
        [ 0.481722  ],
        [ 1.43885032],
        [-0.26759658],
        [ 0.97333563]]),
 array([ 0.34549583,  0.96903151, -0.15688824, -0.81784256,  0.1501606 ,
        -0.48995715]))
In [370]:
coef = np.vstack((lr.coef_.T, lr.intercept_))
coef
Out[370]:
array([[-0.33648891, -2.28982245,  0.481722  ,  1.43885032, -0.26759658,
         0.97333563],
       [ 0.34549583,  0.96903151, -0.15688824, -0.81784256,  0.1501606 ,
        -0.48995715]])
In [371]:
# Sigmoid function to make scores more legible, and easy to understand
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
scores = sigmoid(test.iloc[:, 5:7] @ coef[:, :7])
scores = scores.set_axis([c+"-score" for c in lr.classes_],axis = 1)
/Users/fish/opt/anaconda3/lib/python3.9/site-packages/pandas/core/internals/blocks.py:402: RuntimeWarning:

overflow encountered in exp

In [372]:
log_frame = pd.concat((test,scores), axis = 1)
log_frame.head(5)
Out[372]:
School County City Type Eth Value Rate Predicted African American-score American Indian-score Asian-score Chicano/Latino-score International-score White-score
137 COLLEGE OF SAN MATEO San Mateo San Mateo Adm International 117 0.866667 Chicano/Latino 1.076994e-17 1.030940e-116 1.000000 1.000000 2.879251e-14 1.000000
8 AMERICAN RIVER COLLEGE Sacramento Sacramento Adm International 9 0.900000 Chicano/Latino 6.195126e-02 2.683152e-09 0.985143 0.999995 9.336489e-02 0.999756
668 SAN JOAQUIN DELTA COLLEGE San Joaquin Stockton Adm African American 11 0.687500 White 3.035851e-02 2.240274e-11 0.994465 1.000000 5.518349e-02 0.999969
366 LAKE TAHOE COMMUNITY COLLEGE El Dorado South Lake Tahoe Adm Asian 7 0.636364 Asian 1.056882e-01 2.025861e-07 0.963463 0.999929 1.445973e-01 0.998501
336 GROSSMONT CMTY COLLEGE San Diego El Cajon Adm Asian 35 0.729167 Chicano/Latino 9.878028e-06 3.168562e-35 1.000000 1.000000 9.549050e-05 1.000000
In [373]:
# The probability that the predicted ethnicity was equal to the actual
len(log_frame['Eth']==log_frame['Predicted'])
(sum(log_frame['Eth'] == log_frame['Predicted']))/len(log_frame)
Out[373]:
0.3225806451612903
In [428]:
# The counts of the ethnicities correctly predicted
log_frame[log_frame['Eth'] == log_frame['Predicted']].Eth.value_counts()
Out[428]:
Chicano/Latino     33
American Indian    12
Name: Eth, dtype: int64
In [534]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
## African American as the reference variable
# Summary table for significant variables
formula = "Rate ~ C(Eth)"
log_reg = smf.logit(formula, data=adm_frame).fit() 
print(log_reg.params) 
Optimization terminated successfully.
         Current function value: 0.586023
         Iterations 5
                           Logit Regression Results                           
==============================================================================
Dep. Variable:                   Rate   No. Observations:                  619
Model:                          Logit   Df Residuals:                      613
Method:                           MLE   Df Model:                            5
Date:                Mon, 05 Dec 2022   Pseudo R-squ.:                 0.08705
Time:                        23:39:24   Log-Likelihood:                -362.75
converged:                       True   LL-Null:                       -397.33
Covariance Type:            nonrobust   LLR p-value:                 1.523e-13
=============================================================================================
                                coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------
Intercept                     0.2338      0.192      1.218      0.223      -0.142       0.610
C(Eth)[T.American Indian]    -1.3232      0.321     -4.118      0.000      -1.953      -0.693
C(Eth)[T.Asian]               0.4479      0.277      1.615      0.106      -0.096       0.991
C(Eth)[T.Chicano/Latino]      0.7752      0.288      2.693      0.007       0.211       1.339
C(Eth)[T.International]       0.1570      0.285      0.551      0.581      -0.401       0.715
C(Eth)[T.White]               0.6550      0.283      2.314      0.021       0.100       1.210
=============================================================================================
Intercept                    0.233832
C(Eth)[T.American Indian]   -1.323180
C(Eth)[T.Asian]              0.447854
C(Eth)[T.Chicano/Latino]     0.775162
C(Eth)[T.International]      0.156990
C(Eth)[T.White]              0.654991
dtype: float64
In [117]:
import numpy as np
# Our odds ratios
odds_ratios = pd.DataFrame(
    {
        "OR": log_reg.params,
        "Lower CI": log_reg.conf_int()[0],
        "Upper CI": log_reg.conf_int()[1],
    }
)
odds_ratios = np.exp(odds_ratios)
print(odds_ratios)
                                 OR  Lower CI  Upper CI
Intercept                  1.263432  0.867208  1.840688
C(Eth)[T.American Indian]  0.266287  0.141865  0.499835
C(Eth)[T.Asian]            1.564951  0.908806  2.694822
C(Eth)[T.Chicano/Latino]   2.170943  1.234819  3.816749
C(Eth)[T.International]    1.169984  0.669624  2.044225
C(Eth)[T.White]            1.925126  1.105455  3.352565
In [513]:
# DECISION TREE 
# Must encode as eth is categorical
adm_frame['Binary Rate'] = np.where(adm_frame['Rate'] >= 0.5, 1, 0)
adm_frame.head(5)
ohe = pd.get_dummies(data=adm_frame, columns=['Eth']) # Need to get dummy variables
ohe_frame = ohe 
ohe_frame.head(3) # We can see that now eth is encoded
In [431]:
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor 
from matplotlib import pyplot
In [530]:
## ETH AS X, RATE AS Y
X = ohe_frame.iloc[:, 7:14] 
y = ohe_frame['Binary Rate'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
In [531]:
# TRYING DECISION TREE REGRESSOR ON IT
clf = DecisionTreeRegressor()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
In [532]:
model = DecisionTreeRegressor()
# fit the model
model.fit(X, y)
importance = model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.bar([x for x in range(len(importance))], importance)
plt.xlabel('Features') 
plt.ylabel('Score') 
pyplot.title("Feature Importance")
pyplot.show()

# From here, we get the same conclusions as before. Chicano/Latino and American Indian were most important.
Feature: 0, Score: 0.00183
Feature: 1, Score: 0.78606
Feature: 2, Score: 0.04898
Feature: 3, Score: 0.08575
Feature: 4, Score: 0.00000
Feature: 5, Score: 0.07739
In [518]:
from sklearn import tree
tree.plot_tree(clf)
# Tree plot
Out[518]:
[Text(0.75, 0.9166666666666666, 'X[1] <= 0.5\nsquared_error = 0.097\nsamples = 433\nvalue = 0.597'),
 Text(0.625, 0.75, 'X[3] <= 0.5\nsquared_error = 0.067\nsamples = 372\nvalue = 0.651'),
 Text(0.5, 0.5833333333333334, 'X[5] <= 0.5\nsquared_error = 0.081\nsamples = 291\nvalue = 0.625'),
 Text(0.375, 0.4166666666666667, 'X[2] <= 0.5\nsquared_error = 0.093\nsamples = 215\nvalue = 0.601'),
 Text(0.25, 0.25, 'X[4] <= 0.5\nsquared_error = 0.107\nsamples = 133\nvalue = 0.574'),
 Text(0.125, 0.08333333333333333, 'squared_error = 0.071\nsamples = 69\nvalue = 0.553'),
 Text(0.375, 0.08333333333333333, 'squared_error = 0.145\nsamples = 64\nvalue = 0.597'),
 Text(0.5, 0.25, 'squared_error = 0.068\nsamples = 82\nvalue = 0.645'),
 Text(0.625, 0.4166666666666667, 'squared_error = 0.041\nsamples = 76\nvalue = 0.694'),
 Text(0.75, 0.5833333333333334, 'squared_error = 0.006\nsamples = 81\nvalue = 0.745'),
 Text(0.875, 0.75, 'squared_error = 0.149\nsamples = 61\nvalue = 0.266')]
In [745]:
# KNN PREDICTION
feature_cols = ['Rate']#, 'Value'
X = ohe_frame[feature_cols] 
y = ohe_frame.iloc[:, 7:14] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

knn = KNeighborsClassifier(n_neighbors=7)
  
knn.fit(X_train, y_train)

print(knn.score(X_test, y_test)) # Accuracy score
0.1827956989247312
In [748]:
y_pred = knn.predict(X_test)
cr = classification_report(y_test, y_pred)
print(cr) # American Indian and Chicano/Latino had the highest F1 scores. 
# F1 is the harmonic mean of precicion and recall, the aforementioned two had higher accuracy.
              precision    recall  f1-score   support

           0       0.50      0.22      0.31        41
           1       0.44      0.74      0.55        19
           2       0.50      0.03      0.06        30
           3       0.45      0.17      0.24        30
           4       0.75      0.10      0.18        30
           5       1.00      0.06      0.11        36

   micro avg       0.49      0.18      0.27       186
   macro avg       0.61      0.22      0.24       186
weighted avg       0.62      0.18      0.22       186
 samples avg       0.18      0.18      0.18       186

/Users/fish/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.

In [762]:
f_list
Out[762]:
['African American',
 'American Indian',
 'Asian',
 'Chicano/Latino',
 'International',
 'White',
 'micro avg',
 'macro avg',
 'weighted avg',
 'samples avg']
In [763]:
# Converting to a dataframe so we can make it look nice in plotly
report = classification_report(y_test, y_pred, output_dict = True)
knncr = pd.DataFrame(report).transpose()
extra = ['micro avg', 'macro avg', 'weighted avg', 'samples avg']
f_list = e_list + extra
knncr.insert(0, "Eth and Avg", f_list) # adding a column that displays the eth and avg for clarification
/Users/fish/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1318: UndefinedMetricWarning:

Precision and F-score are ill-defined and being set to 0.0 in samples with no predicted labels. Use `zero_division` parameter to control this behavior.

In [767]:
fig = go.Figure(data=[go.Table(
    header=dict(values=list(knncr.columns),
                fill_color='lavenderblush',
                align='left'),
    cells=dict(values=[knncr['Eth and Avg'], knncr.precision, knncr.recall, knncr['f1-score'], knncr.support],
               fill_color='lavender',
               align='left'))
])
fig.update_layout(
    title_text = "KNN Classification Report",
    title_font_size=30,
    title_x=0.5,
    font_family="Times New Roman",
    font_color="black",
    title_font_family="Times New Roman",
    title_font_color="black",
)
fig.update_traces(cells_font=dict(size = 10))
fig.show()
In [550]:
# Lets try to use a different prediction model. Let's switch them around. 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = ohe_frame.iloc[:, 7:14] 
y = ohe_frame['Binary Rate'] 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
  
knn = KNeighborsClassifier(n_neighbors=7)
  
knn.fit(X_train, y_train)
  
print(knn.score(X_test, y_test))
0.8602150537634409
In [525]:
from sklearn.neighbors import KNeighborsClassifier

#Setup arrays to store training and test accuracies
neighbors = np.arange(1,7)
train_accuracy =np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

for i,k in enumerate(neighbors):
    #Setup a knn classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)
    
    #Fit the model
    knn.fit(X_train, y_train)
    
    #Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)
    
    #Compute accuracy on the test set
    test_accuracy[i] = knn.score(X_test, y_test)
In [526]:
plt.title('k-NN Varying number of neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training accuracy')
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()
# Accurate between the two
In [528]:
from sklearn.metrics import confusion_matrix

y_pred = knn.predict(X_test)
confusion_matrix(y_test,y_pred) # True Negative 22, False Positive 13, 
                                # True Positive 60, False Negative 124
Out[528]:
array([[ 22,  13],
       [ 27, 124]])
In [529]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))
# Way higher F1 scores
              precision    recall  f1-score   support

           0       0.45      0.63      0.52        35
           1       0.91      0.82      0.86       151

    accuracy                           0.78       186
   macro avg       0.68      0.72      0.69       186
weighted avg       0.82      0.78      0.80       186

In [535]:
# Plot of ROC curve, indicating we have pretty high accuracy for lines closer to the left
y_pred_proba = knn.predict_proba(X_test)[:,1]
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

plt.plot([0,1],[0,1],'k--')
plt.plot(fpr,tpr, label='Knn')
plt.xlabel('fpr')
plt.ylabel('tpr')
plt.title('Knn(n_neighbors=7) ROC curve')
plt.show()
In [185]:
# Summary table for which vars are most significant. 
mlr = pd.get_dummies(data=final_frame, columns=['Eth'])

x = mlr[['GPA', 'Value', 'Eth_Asian','Eth_Chicano/Latino', 'Eth_International', 'Eth_White']]
y = mlr['Rate']
# with sklearn
regr = linear_model.LinearRegression()
regr.fit(x, y)

print('Intercept: \n', regr.intercept_)
print('Coefficients: \n', regr.coef_)

# with statsmodels
x = sm.add_constant(x) # adding a constant
 
model = sm.OLS(y, x).fit()
predictions = model.predict(x) 
 
print_model = model.summary()
print(print_model)
Intercept: 
 0.028055734103871077
Coefficients: 
 [ 2.11277822e-01  5.22287539e-05 -1.50052021e-02 -4.66654219e-02
  8.76753014e-02 -2.60046773e-02]
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   Rate   R-squared:                       0.307
Model:                            OLS   Adj. R-squared:                  0.274
Method:                 Least Squares   F-statistic:                     9.296
Date:                Sat, 03 Dec 2022   Prob (F-statistic):           2.36e-07
Time:                        12:54:37   Log-Likelihood:                 160.78
No. Observations:                 111   AIC:                            -309.6
Df Residuals:                     105   BIC:                            -293.3
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
======================================================================================
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  0.0224      0.220      0.102      0.919      -0.413       0.458
GPA                    0.2113      0.078      2.719      0.008       0.057       0.365
Value               5.223e-05   5.16e-05      1.011      0.314   -5.02e-05       0.000
Eth_Asian             -0.0094      0.058     -0.163      0.871      -0.124       0.105
Eth_Chicano/Latino    -0.0411      0.051     -0.808      0.421      -0.142       0.060
Eth_International      0.0933      0.062      1.509      0.134      -0.029       0.216
Eth_White             -0.0204      0.057     -0.357      0.722      -0.134       0.093
==============================================================================
Omnibus:                       24.858   Durbin-Watson:                   2.265
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               84.237
Skew:                           0.672   Prob(JB):                     5.11e-19
Kurtosis:                       7.051   Cond. No.                     1.66e+18
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1.04e-30. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.